Notes:
Notes:
library(ggplot2)
# NOTE(review): hard-coded working directory; the relative data path below
# only resolves on a machine that has this directory.
setwd("~/projects/Classes/FoundationsOfDataScience_sliderule/github/UD651/L4")

# Read the tab-separated file pseudo_facebook.tsv into `pf`.
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")

# First look: friend_count against age.
qplot(x = age, y = friend_count, data = pf)
Response:
Notes:
# Jittered scatter of friend_count by age on a square-root y axis, ages 13-90.
# `height = 0` is spelled out in full (the original `h` relied on partial
# argument matching); it keeps y unjittered, so a count of 0 is never pushed
# below zero before the sqrt transform is applied.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20, position = position_jitter(height = 0)) +
  coord_trans(y = "sqrt") +
  xlim(13, 90)
## Warning: Removed 5192 rows containing missing values (geom_point).
Notes:
# Semi-transparent scatter of friend_count by age, x axis limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 4906 rows containing missing values (geom_point).
Response:
Notes:
# Same scatter with default jitter applied to the points.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 5168 rows containing missing values (geom_point).
# Quick scatter of friend_count by age on a square-root y axis.
# The transformation is passed as `y =`, matching every other coord_trans()
# call in this file; `ytrans =` is the deprecated spelling of that argument.
qplot(age, friend_count, data = pf) +
  coord_trans(y = "sqrt")
# Semi-transparent scatter of friend_count by age, square-root y axis,
# x axis limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  coord_trans(y = "sqrt") +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 4906 rows containing missing values (geom_point).
# Five-number summary (plus mean) of the age column.
summary(pf[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Notes:
# Jittered scatter of friendships_initiated by age on a log10 y axis.
# One is added to y so that zero counts stay finite under log10.
# `height = 0` is spelled out in full (the original `h` relied on partial
# argument matching) and leaves the y values unjittered.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated + 1)) +
  geom_jitter(alpha = 1 / 20, position = position_jitter(height = 0)) +
  coord_trans(y = "log10") +
  xlim(13, 90)
## Warning: Removed 5191 rows containing missing values (geom_point).
Notes:
Notes:
#install.packages('dplyr')
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Group users by age, then compute the mean and median friend count and the
# group size for each age, ordered by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

# Show the first six ages.
head(pf.fc_by_age, n = 6)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## (int) (dbl) (dbl) (int)
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
library(dplyr)
# Per-age mean, median and group size of friend_count, ordered by age.
# Written with `%>%`: the `%.%` operator used previously is deprecated in
# dplyr and emitted a warning at every step of the chain.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Show the first 20 ages.
head(pf.fc_by_age, 20)
## Source: local data frame [20 x 4]
##
## age friend_count_mean friend_count_median n
## (int) (dbl) (dbl) (int)
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
## 11 23 202.8426 93.0 4404
## 12 24 185.7121 92.0 2827
## 13 25 131.0211 62.0 3641
## 14 26 144.0082 75.0 2815
## 15 27 134.1473 72.0 2240
## 16 28 125.8354 66.0 2364
## 17 29 120.8182 66.0 1936
## 18 30 115.2080 67.5 1716
## 19 31 118.4599 63.0 1694
## 20 32 114.2800 63.0 1443
Create your plot!
# Alternative using the mean instead of the median:
# ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
#   geom_line()

# Median friend count as a line over age.
ggplot(
  data = pf.fc_by_age,
  mapping = aes(x = age, y = friend_count_median)
) +
  geom_line()
Notes:
# Jittered scatter of friend_count + 1 by age on a log10 y axis, showing
# ages 13-90.
# A plot holds a single coordinate system: adding coord_trans() after
# coord_cartesian() replaced the latter, so its xlim was silently dropped.
# The x range is therefore given to coord_trans() directly
# (argument `xlim` in ggplot2 >= 3.3.0; older releases call it `limx`).
# `height = 0` replaces the partially matched `h = 0`.
ggplot(data = pf, mapping = aes(x = age, y = friend_count + 1)) +
  geom_jitter(alpha = 1 / 20, position = position_jitter(height = 0)) +
  coord_trans(y = "log10", xlim = c(13, 90))
# Scatter of friend_count by age (orange) overlaid with per-age summary lines:
# median (default colour), mean (blue) and the 10th / 90th percentiles
# (dashed purple). The view is zoomed to ages 13-70 and counts 0-1000 with
# coord_cartesian(), so the summaries are still computed from all rows.
#
# The summary function is passed as `fun` and its extra arguments through
# `fun.args` (ggplot2 >= 3.3.0). The deprecated `fun.y` and a loose `probs`
# are not recognised when the stat is selected via geom_line(stat = "summary").
# `height = 0` replaces the partially matched `h = 0`.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun = median) +
  geom_line(stat = "summary", fun = mean, color = "blue") +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "purple"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "purple"
  )
Response:
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Open the help page for cor.test.
help("cor.test")

# Pearson correlation test between age and friend_count.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Same Pearson test, evaluated inside `pf` so columns are named directly.
with(
  pf,
  cor.test(x = age, y = friend_count, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response:
-0.027
Notes:
# Template: with( , cor.test(age, friend_count))
# Pearson correlation test restricted to users aged 70 or younger.
with(
  subset(pf, subset = age <= 70),
  cor.test(x = age, y = friend_count, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Notes:
# Spearman rank correlation test for users aged 70 or younger.
with(
  subset(pf, subset = age <= 70),
  cor.test(x = age, y = friend_count, method = "spearman")
)
## Warning in cor.test.default(age, friend_count, method = "spearman"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: age and friend_count
## S = 1.5782e+14, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2552934
Notes:
# List the column names of pf.
colnames(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
# Earlier explorations, kept for reference:
# qplot(www_likes_received, www_likes_received / likes_received, data = pf)
# qplot(www_likes_received, likes_received, data = pf) +
#   coord_cartesian(xlim = c(0, 12500), ylim = c(0, 50000))
# qplot(www_likes_received, likes_received, data = pf) +
#   coord_cartesian(xlim = c(0, 2500), ylim = c(0, 20000))

# likes_received against www_likes_received, zoomed to x 0-300 and y 0-1250,
# with the mean of likes_received at each x and a y = x reference line.
#
# The reference line maps y through aes() rather than passing the whole
# `pf$www_likes_received` vector as a constant aesthetic. The summary function
# is passed as `fun` (ggplot2 >= 3.3.0), because `fun.y` is not recognised via
# geom_line(stat = "summary"). `height = 0` replaces the partially matched `h`.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  coord_cartesian(xlim = c(0, 300), ylim = c(0, 1250)) +
  geom_point(
    alpha = 1 / 10,
    position = position_jitter(height = 0),
    color = "blue"
  ) +
  geom_line(stat = "summary", fun = mean) +
  geom_line(aes(y = www_likes_received))
***
Notes:
# likes_received against www_likes_received, zoomed on both axes to the 95th
# percentile of each variable, with a linear fit (red), the mean of
# likes_received at each x, and a y = x reference line.
#
# The reference line maps y through aes() rather than passing the whole
# `pf$www_likes_received` vector as a constant aesthetic. The summary function
# is passed as `fun` (ggplot2 >= 3.3.0), because `fun.y` is not recognised via
# geom_line(stat = "summary"). `height = 0` replaces the partially matched `h`.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  coord_cartesian(
    xlim = c(0, quantile(pf$www_likes_received, 0.95)),
    ylim = c(0, quantile(pf$likes_received, 0.95))
  ) +
  geom_point(
    alpha = 1 / 10,
    position = position_jitter(height = 0),
    color = "blue"
  ) +
  geom_smooth(method = "lm", color = "red") +
  geom_line(stat = "summary", fun = mean) +
  geom_line(aes(y = www_likes_received))
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
# Pearson correlation test between www_likes_received and likes_received,
# computed on all rows of pf.
with(
  pf,
  cor.test(x = www_likes_received, y = likes_received, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Response: cor 0.9479902
0.948
Notes:
Notes:
#install.packages('alr3')
library(alr3)
## Loading required package: car
# Load the Mitchell data set and open its help page.
data("Mitchell")
help("Mitchell")
Create your plot!
# Column names of the Mitchell data set.
colnames(Mitchell)
## [1] "Month" "Temp"

# Temp against Month modulo 12, so that all years overlay on one 0-11 axis.
ggplot(data = Mitchell, mapping = aes(x = Month %% 12, y = Temp)) +
  geom_point()
Take a guess for the correlation coefficient for the scatterplot.
What is the actual correlation of the two variables? (Round to the thousandths place)
# Correlation test between Month and Temp (cor.test's default method).
with(
  Mitchell,
  cor.test(x = Month, y = Temp)
)
##
## Pearson's product-moment correlation
##
## data: Month and Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Notes:
# Temp against Month, with the point layer overriding x to Month modulo 12.
# The column is referenced by bare name so it is looked up in the layer's
# data; `Mitchell$Month` inside aes() bypassed the plot data and would go out
# of sync if the data were filtered, faceted or swapped.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point(mapping = aes(x = Month %% 12))
# Month spans 0 to 203, which fixes the break sequence used below.
range(Mitchell$Month)
## [1] 0 203

# Temp against Month with an axis break every 12 months.
# Month is numeric, so the breaks go on a continuous x scale; a discrete scale
# does not apply to a continuous variable and loses the axis breaks.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
What do you notice? Response: banding and periodicity
Watch the solution video and check out the Instructor Notes! Notes:
Notes:
# Median friend count as a line over age.
ggplot(
  data = pf.fc_by_age,
  mapping = aes(x = age, y = friend_count_median)
) +
  geom_line()

# First six rows of the per-age summary.
head(pf.fc_by_age, n = 6)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## (int) (dbl) (dbl) (int)
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
# Rows 17 to 19 of the per-age summary (ages 29-31 in the output below).
pf.fc_by_age[seq(17, 19), ]
## Source: local data frame [3 x 4]
##
## age friend_count_mean friend_count_median n
## (int) (dbl) (dbl) (int)
## 1 29 120.8182 66.0 1936
## 2 30 115.2080 67.5 1716
## 3 31 118.4599 63.0 1694
# Age in years plus the fraction (12 - dob_month) / 12.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)
Programming Assignment
# Age in years plus the fraction (12 - dob_month) / 12.
pf$age_with_months <- (pf$age + (12 - pf$dob_month) / 12)

# Mean, median and group size of friend_count for each age_with_months value,
# ordered by age_with_months.
# Written with `%>%`: the `%.%` operator used previously is deprecated in
# dplyr and emitted a warning at every step of the chain.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
# Histogram of the per-group mean friend counts (qplot with x only).
qplot(x = friend_count_mean, data = pf.fc_by_age_months)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Same summary built step by step without a pipe: group by age_with_months,
# summarise friend_count, then order by age_with_months.
age_with_months_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months2 <- arrange(
  summarise(
    age_with_months_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)
# Mean friend count against age_with_months as blue points, zoomed to x 12-71.
# `height = 0` is spelled out in full (the original `h` relied on partial
# argument matching) and leaves the y values unjittered.
ggplot(
  data = pf.fc_by_age_months,
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  coord_cartesian(xlim = c(12, 71)) +
  geom_point(
    alpha = 68 / 100,
    position = position_jitter(height = 0),
    color = "blue"
  )
# Mean friend count against age_with_months as a blue line, zoomed to x 12-71.
ggplot(
  data = pf.fc_by_age_months,
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  coord_cartesian(xlim = c(12, 71)) +
  geom_line(color = "blue")
# Same line plot, but dropping rows with age_with_months >= 71 from the data
# instead of zooming the coordinate system.
ggplot(
  data = subset(pf.fc_by_age_months, subset = age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(color = "blue")
```
Notes:
library(gridExtra)
# Three views of mean friend count for ages below 71, stacked in one column.
# Assignments use `<-`. The summary function for p3 is passed as `fun`
# (ggplot2 >= 3.3.0), because `fun.y` is not recognised via
# geom_line(stat = "summary").

# p1: mean by whole-year age, with a smoother.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: mean by age_with_months, with a smoother.
p2 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: mean of friend_count with age rounded to the nearest multiple of 5.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)

# Finest resolution on top, coarsest at the bottom.
grid.arrange(p2, p1, p3, ncol = 1)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
Notes:
You don’t have to choose just one plot in EDA — the process is exploratory!
Reflection:
Many graphing tools exist in R for EDA of two variables. dplyr and ggplot2 appear to offer many more options and features that are still left to investigate.
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!